| SOCR ≫ | DSPA ≫ | DSPA2 Topics ≫ |
#library(utility)
source("utility/R/apply_label_encoding.R")
source("utility/R/normalize_df.R")
source("utility/R/convert_dates_to_numeric.R")
library(progress)
library(data.table)
library(Rtsne)
library(ggplot2)
library(plotly)
library(lubridate)
library(dummies)
library(umap)
library(dplyr)
library(missForest)
## total_cols: 22940 total rows: 502493
library(readr)
# File locations: the compiled UKBB table ships as a zipped TSV.
zipFilePath <- "ukbb_data/ukb44534_compiled_tab-001.csv.zip"
csvFileName <- "ukb44534_compiled_tab-001.csv"

# Read a single row just to discover the number of columns cheaply.
temp_data <- read.csv(unz(zipFilePath, csvFileName), sep = "\t", nrows = 1)
total_cols <- ncol(temp_data)

# Count rows by streaming the file straight out of the zip archive in
# chunks, so the full ~500k-row table is never held in memory at once.
# (The previous `wc -l` call failed with status 1 because the uncompressed
# CSV does not exist on disk -- only the .zip does.)
row_con <- unz(zipFilePath, csvFileName)
open(row_con, open = "r")
total_rows <- 0L
while (length(chunk <- readLines(row_con, n = 50000L)) > 0L) {
  total_rows <- total_rows + length(chunk)
}
close(row_con)
This code block sets up parallel processing capabilities, defines file paths, and initializes progress tracking. It includes a function to process each column of the dataset, determining the data type and saving summaries.
## Loading required package: iterators
## Loading required package: parallel
# Parallel back end for the per-column processing below. %dopar% needs
# doParallel attached (it also attaches foreach/iterators/parallel); the
# knit log ("Loading required package: iterators/parallel") shows the
# original document loaded it, but the call was missing from this source.
library(doParallel)
library(ggplot2)
cores <- detectCores() - 1
registerDoParallel(cores)

# Paths for the toy data set, the resumable progress log, and summaries.
data_path <- "ukbb_data/random_data.csv"
progress_path <- "progress_log.csv"
total_cols <- 40L
summary_dir <- "summaries/"
if (!dir.exists(summary_dir)) {
  dir.create(summary_dir, recursive = TRUE)
}

# Resume support: restart after the last column recorded in the log.
# Guard against an existing-but-empty log, where max() would warn and
# return -Inf, breaking start_col.
if (file.exists(progress_path)) {
  progress <- fread(progress_path)
  start_col <- if (nrow(progress) > 0) max(progress$col_processed) + 1 else 2
} else {
  fwrite(data.table(col_processed = integer(0)), progress_path)
  start_col <- 2  # presumably column 1 is an ID column -- TODO confirm
}
# Classify one column of the file at `data_path` by type (Empty / Binary /
# Numeric / Time / Text / Categorical), write the matching summary artifact
# to disk, and append the column index to the progress log so interrupted
# runs can resume.
#
# @param col Integer column index into the file at `data_path` (global).
# @return list(colname = <column title>, type = <detected type string>).
process_column <- function(col) {
  column_titles <- fread(data_path, nrows = 1)
  col_title <- names(column_titles)[col]

  # Type detection is based on a sample of the first 30 rows only; the
  # full column is read later, and only for types that need it.
  current_col_sample <- fread(data_path, select = col, nrows = 30)
  detected_type <- "Other"
  sample_values <- na.omit(current_col_sample[[1]])

  if (length(sample_values) == 0) {
    detected_type <- "Empty"
  } else if (is.numeric(sample_values)) {
    # is.numeric() is already a single test on the whole vector; the
    # original's all(is.numeric(...)) was a redundant wrapper.
    if (all(sample_values %in% c(0, 1))) {
      detected_type <- "Binary"
      current_col <- fread(data_path, select = col)
      freq <- as.data.frame(table(current_col[[1]], useNA = "ifany"))
      freq$Prop <- freq$Freq / sum(freq$Freq)
      if (!dir.exists("summaries_binary")) {
        dir.create("summaries_binary")
      }
      fwrite(freq, paste0("summaries_binary/", col_title, ".csv"))
    } else {
      detected_type <- "Numeric"
      current_col <- fread(data_path, select = col)
      values <- na.omit(current_col[[1]])
      p <- ggplot(data.frame(values), aes(x = values)) +
        geom_density() +
        labs(title = paste("Data Distribution for", col_title))
      # BUG FIX: the plot is saved under summaries_numeric/, so that is the
      # directory that must exist. The original created summaries_binary/
      # here, so ggsave() failed whenever summaries_numeric/ was missing.
      if (!dir.exists("summaries_numeric")) {
        dir.create("summaries_numeric")
      }
      ggsave(paste0("summaries_numeric/", col_title, "_distribution.png"), plot = p)
    }
  } else if (any(grepl("^\\d{4}-\\d{2}-\\d{2}( \\d{2}:\\d{2}:\\d{2})?$", sample_values)) ||
             any(grepl("^\\d{2}:\\d{2}:\\d{2}$", sample_values))) {
    # ISO-style dates/datetimes or bare HH:MM:SS stamps.
    detected_type <- "Time"
  } else {
    current_col <- fread(data_path, select = col)
    unique_categories <- length(unique(current_col[[1]]))
    # More than 50 distinct values is treated as free text rather than a
    # usable categorical variable.
    if (unique_categories > 50) {
      detected_type <- "Text"
    } else {
      detected_type <- "Categorical"
      freq <- as.data.frame(table(current_col[[1]], useNA = "ifany"))
      freq$Prop <- freq$Freq / sum(freq$Freq)
      fwrite(freq, paste0("summaries/", col_title, ".csv"))
    }
  }

  # Record completion so a restarted run skips this column.
  fwrite(data.table(col_processed = col), progress_path, append = TRUE)
  list(colname = col_title, type = detected_type)
}
# Process the remaining columns in parallel. Workers must attach every
# package the worker function touches; the original exported only
# data.table, which fails on non-forking (PSOCK) clusters where ggplot2
# is not inherited from the master session.
results <- foreach(col = start_col:total_cols, .combine = 'rbind',
                   .packages = c('data.table', 'ggplot2')) %dopar% {
  process_column(col)
}
## Warning in mclapply(argsList, FUN, mc.preschedule = preschedule, mc.set.seed =
## set.seed, : scheduled cores 2, 3, 4, 5 did not deliver results, all values of
## the jobs will be affected
This code block renders each "Categorical" column summary as a pie chart.
library(data.table)

# Render every categorical summary CSV in summaries/ as a pie chart PNG
# under summaries/plot/.
summaries_dir <- "summaries/"
plots_dir <- paste0(summaries_dir, "plot/")
if (!dir.exists(plots_dir)) {
  dir.create(plots_dir)
}
csv_files <- list.files(path = summaries_dir, pattern = "\\.csv$", full.names = TRUE)
for (file_path in csv_files) {
  data <- fread(file_path)
  if (nrow(data) > 1) {
    pie_colors <- rainbow(nrow(data))
    output_file_name <- paste0(plots_dir, basename(sub(".csv$", "", file_path)), ".png")
    png(file = output_file_name)
    # Close the device even if pie() errors, so later plots are not drawn
    # into a stale PNG device left open by a failed iteration.
    tryCatch(
      pie(data$Freq,
          labels = paste0(data$Var1, ": ", round(data$Prop * 100, 1), "%"),
          col = pie_colors,
          main = paste("Pie Chart for ", basename(file_path))),
      finally = dev.off()
    )
  } else {
    cat("Not enough data to plot a pie chart for ", basename(file_path), "\n")
  }
}
# This code block sets up for data visualization by reading a summary file
# and preparing a pie chart to visualize the distribution of data types.
# Visualize the distribution of detected column types as a pie chart,
# both saved to a PNG and rendered on the current device.
file_path <- "column_types_summary.csv"
data <- fread(file_path)
type_counts <- table(data$type)
type_counts_df <- as.data.frame(type_counts)
names(type_counts_df) <- c("Type", "Frequency")
type_counts_df$Prop <- type_counts_df$Frequency / sum(type_counts_df$Frequency)
if (nrow(type_counts_df) > 1) {
  pie_colors <- rainbow(nrow(type_counts_df))
  slice_labels <- paste0(type_counts_df$Type, ": ",
                         round(type_counts_df$Prop * 100, 1), "%")
  # Save the chart to a file first ...
  png(file = "type_distribution_pie_chart.png")
  pie(type_counts_df$Frequency, labels = slice_labels,
      col = pie_colors, main = "Pie Chart of Data Types")
  dev.off()
  # ... then draw the same chart interactively in the R session.
  pie(type_counts_df$Frequency, labels = slice_labels,
      col = pie_colors, main = "Pie Chart of Data Types")
} else {
  cat("Not enough data to plot a pie chart.\n")
}
# hierarchical storage
# Group the field names in the compiled-fields file by their base feature
# name. Field names here look like <feature>_f<field>_<instance>_<array>
# (e.g. triplet_played_left_f4229_0_9), so dropping the final three
# "_"-separated parts yields the base feature name.
file_path <- "ukbb_data/ukb44534_compiled_fields.csv"
lines <- readLines(file_path)
parts_list <- strsplit(lines, "_")
keep <- lengths(parts_list) > 3   # names with <= 3 parts have no suffix to strip
kept_lines <- lines[keep]
feature_keys <- vapply(
  parts_list[keep],
  function(p) paste(p[seq_len(length(p) - 3)], collapse = "_"),
  character(1)
)
# split() replaces the original element-by-element list growth; pinning the
# factor levels to first appearance preserves the original insertion order.
features_list <- split(kept_lines, factor(feature_keys, levels = unique(feature_keys)))
# Bag-of-words (one-hot) encoding of the base feature names: one row per
# feature, one column per underscore-delimited word, 1 = word present.
feature_names <- names(features_list)
words_list <- strsplit(feature_names, "_")
unique_words <- unique(unlist(words_list))
bag_of_words_matrix <- matrix(0, nrow = length(feature_names), ncol = length(unique_words))
colnames(bag_of_words_matrix) <- unique_words
for (i in seq_along(feature_names)) {
  # Vectorized name-indexed assignment marks all of this feature's words in
  # one step (duplicated words simply rewrite the same 1).
  bag_of_words_matrix[i, words_list[[i]]] <- 1
}
# Hierarchical clustering of features on the bag-of-words encoding.
distance_matrix <- dist(bag_of_words_matrix, method = "euclidean")
hc <- hclust(distance_matrix)
plot(hc)

# Cut the dendrogram into 20 groups and show their sizes.
clusters <- cutree(hc, k = 20)
cluster_sizes <- table(clusters)
barplot(cluster_sizes, main = "Cluster Sizes", xlab = "Cluster",
        ylab = "Size", col = rainbow(10))
## Loading required package: stats4
## mi (Version 1.1, packaged: 2022-06-05 05:31:15 UTC; ben)
## mi Copyright (C) 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015 Trustees of Columbia University
## This program comes with ABSOLUTELY NO WARRANTY.
## This is free software, and you are welcome to redistribute it
## under the General Public License version 2 or later.
## Execute RShowDoc('COPYING') for details.
# Read the 1000-row toy subset, treating empty strings and "NA"/"na" as
# missing, then convert date columns to numeric via the project helper.
data <- fread('ukbb_data/1000_rows.csv', na.strings = c("", "NA", "na"))
data <- convert_dates_to_numeric(data)
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning: All formats failed to parse. No formats found.
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): target_number_to_be_entered_f4252_1_1 : cannot infer variable
## type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): number_entered_by_participant_f4258_0_12 : cannot infer
## variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): visual_acuity_result_in_round_left_f5082_1_8 : cannot infer
## variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): operation_code_f20004_2_4 : cannot infer variable type when all
## values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name):
## method_of_recording_time_when_noncancer_illness_first_diagnosed_f20013_2_11 :
## cannot infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): workplace_full_of_chemical_or_other_fumes_f22610_0_24 : cannot
## infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): breathing_problems_during_period_of_job_f22616_0_19 : cannot
## infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): values_wanted_f23321_2_78 : cannot infer variable type when all
## values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): values_wanted_f23321_2_104 : cannot infer variable type when
## all values are NA, guessing 'irrelevant'
## Loading required namespace: betareg
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name):
## mean_icvf_in_anterior_corona_radiata_on_fa_skeleton_right_f25366_3_0 : cannot
## infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name):
## mean_od_in_superior_corona_radiata_on_fa_skeleton_left_f25417_3_0 : cannot
## infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): volume_of_molecularlayerhphead_left_hemisphere_f26629_3_0 :
## cannot infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): mean_thickness_of_precentral_right_hemisphere_f27288_3_0 :
## cannot infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): histology_of_cancer_tumour_f40011_16_0 : cannot infer variable
## type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): external_ca_f41201_0_6 : cannot infer variable type when all
## values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): main_speciality_of_consultant_polymorphic_f41207_0_11 : cannot
## infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): date_of_first_inpatient_diagnosis_main_icd10_f41262_0_33 :
## cannot infer variable type when all values are NA, guessing 'irrelevant'
## Warning in .guess_type(y, favor_ordered, favor_positive, threshold,
## variable_name): prawns_intake_f103200_4_0 : cannot infer variable type when all
## values are NA, guessing 'irrelevant'
## NOTE: The following pairs of variables appear to have the same missingness pattern.
## Please verify whether they are in fact logically distinct variables.
## [,1] [,2]
## [1,] "triplet_played_left_f4229_0_9" "triplet_entered_left_f4236_0_8"
Read a small subset of the data for a toy analysis.
# Label-encode and normalize via the project helpers, drop columns that are
# entirely NA (missForest cannot impute a column with no observed values),
# then impute the remaining missing entries with missForest.
data <- apply_label_encoding(data)
data <- normalize_df(data)
data <- data %>% select(where(~ !all(is.na(.x))))
data <- missForest(data, verbose = TRUE)$ximp
## missForest iteration 1 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.03137591
## time: 0.023 seconds
##
## missForest iteration 2 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.02204959
## time: 0.024 seconds
##
## missForest iteration 3 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.008215134
## time: 0.03 seconds
##
## missForest iteration 4 in progress...
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## Warning in randomForest.default(x = obsX, y = obsY, ntree = ntree, mtry = mtry,
## : The response has five or fewer unique values. Are you sure you want to do
## regression?
## done!
## estimated error(s): NaN
## difference(s): 0.01161644
## time: 0.023 seconds
This section performs a K-means clustering analysis. It identifies the optimal number of clusters using average silhouette scores and visualizes the average silhouette score as a function of the number of clusters k.
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(ggplot2) # for plotting
# silhouette() lives in the cluster package; the call was missing from the
# visible source (presumably attached upstream -- attach it explicitly).
library(cluster)
set.seed(123)

# Scan k = 2..30 and keep the k with the highest average silhouette width.
k_candidates <- 2:30
optimal_k <- 2
highest_silhouette <- 0
sil_scores <- numeric(length(k_candidates))
# The distance matrix does not depend on k; compute it once instead of
# recomputing it inside every one of the 29 loop iterations.
dist_mat <- dist(data_matrix)
for (idx in seq_along(k_candidates)) {
  k <- k_candidates[idx]
  km.res <- kmeans(data_matrix, centers = k, nstart = 25)
  ss <- silhouette(km.res$cluster, dist_mat)
  avg.sil <- mean(ss[, 3])   # column 3 = per-point silhouette width
  sil_scores[idx] <- avg.sil
  if (avg.sil > highest_silhouette) {
    highest_silhouette <- avg.sil
    optimal_k <- k
  }
}
# Final clustering at the chosen k.
clusters <- kmeans(data_matrix, centers = optimal_k, nstart = 25)
cat("Optimal number of clusters:", optimal_k, "\n")
## Optimal number of clusters: 3
## K-means clustering with 3 clusters of sizes 18, 38, 7
##
## Cluster means:
## number_of_operations_selfreported_f136_0_0 peak_expiratory_flow_pef_f3064_2_1
## 1 0.1527778 0.4142563
## 2 0.2500000 0.7435186
## 3 0.1607143 0.5635003
## triplet_played_left_f4229_0_9 triplet_entered_left_f4236_0_8
## 1 0.6033770 0.7410625
## 2 0.4014656 0.3229384
## 3 0.6184211 0.1305019
## ecg_stage_name_f5988_1_27 total_peripheral_resistance_during_pwa_f12685_2_0
## 1 0.5947222 0.8394972
## 2 0.1948684 0.7634160
## 3 0.3650000 0.7938677
## values_wanted_f23321_3_16
## 1 0
## 2 0
## 3 0
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 1 0.4393431
## 2 0.7542243
## 3 0.6853954
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## 1 0.3615383
## 2 0.5761865
## 3 0.4959779
## urea_reportability_f30676_1_0
## 1 0
## 2 0
## 3 0
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 1 0.2287167
## 2 0.7636872
## 3 0.4364463
## typical_diet_yesterday_f100020_4_0
## 1 0.8583333
## 2 0.9528947
## 3 0.1814286
##
## Clustering vector:
## [1] 2 1 2 2 2 2 3 2 2 2 2 2 3 3 1 2 2 2 3 2 2 1 3 2 2 1 2 2 2 1 1 2 2 1 1 2 1 2
## [39] 3 2 2 1 2 2 2 2 1 3 1 2 1 2 1 2 2 1 2 1 1 2 2 1 2
##
## Within cluster sum of squares by cluster:
## [1] 4.473613 7.621833 1.864694
## (between_SS / total_SS = 53.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Plot the average silhouette score against the number of clusters.
k_values <- 2:30
sil_df <- data.frame(k_values, sil_scores)
sil_plot <- ggplot(data = sil_df, aes(x = k_values, y = sil_scores)) +
  geom_line() +
  scale_x_continuous(breaks = 2:30) +
  theme_minimal() +
  ggtitle("Silhouette Score vs. Number of Clusters") +
  xlab("Number of Clusters (k)") +
  ylab("Average Silhouette Score")
print(sil_plot)
# Here, PCA, t-SNE, and UMAP techniques are applied for dimensionality
# reduction. The results are prepared for subsequent visualization.
# PCA: drop zero-variance columns first (prcomp with scale. = TRUE cannot
# rescale a constant column to unit variance).
non_constant_data_matrix <- data_matrix[, apply(data_matrix, 2, var) != 0]
pca_result_2d <- prcomp(non_constant_data_matrix, scale. = TRUE, center = TRUE)
pca_2d <- pca_result_2d$x[, 1:2]
pca_3d <- pca_result_2d$x[, 1:3]

# t-SNE embeddings in 2 and 3 dimensions.
tsne_2d <- Rtsne(data_matrix, dims = 2, perplexity = 20, verbose = TRUE)
# NOTE(review): tsne_3d, umap_result_2d, and umap_result_3d are used below
# but were never computed in the visible source. The 3-D t-SNE run is
# confirmed by the logged "no_dims = 3, perplexity = 20.000000" output; the
# UMAP calls are reconstructed with default parameters -- TODO confirm.
tsne_3d <- Rtsne(data_matrix, dims = 3, perplexity = 20, verbose = TRUE)
umap_result_2d <- umap(data_matrix)
umap_result_3d <- umap(data_matrix, n_components = 3)
## Read the 63 x 12 data matrix successfully!
## Using no_dims = 2, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.981104)!
## Learning embedding...
## Iteration 50: error is 51.406093 (50 iterations in 0.00 seconds)
## Iteration 100: error is 48.207531 (50 iterations in 0.00 seconds)
## Iteration 150: error is 52.296232 (50 iterations in 0.00 seconds)
## Iteration 200: error is 53.719866 (50 iterations in 0.00 seconds)
## Iteration 250: error is 50.869441 (50 iterations in 0.00 seconds)
## Iteration 300: error is 1.492406 (50 iterations in 0.00 seconds)
## Iteration 350: error is 1.142512 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.670267 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.266017 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.132867 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.101808 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.097840 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.097811 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.099061 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.097886 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.098196 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.095654 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.097428 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.096479 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.096071 (50 iterations in 0.00 seconds)
## Fitting performed in 0.05 seconds.
## Performing PCA
## Read the 63 x 12 data matrix successfully!
## Using no_dims = 3, perplexity = 20.000000, and theta = 0.500000
## Computing input similarities...
## Building tree...
## Done in 0.00 seconds (sparsity = 0.981104)!
## Learning embedding...
## Iteration 50: error is 50.772559 (50 iterations in 0.01 seconds)
## Iteration 100: error is 49.310880 (50 iterations in 0.01 seconds)
## Iteration 150: error is 50.131639 (50 iterations in 0.01 seconds)
## Iteration 200: error is 51.314426 (50 iterations in 0.01 seconds)
## Iteration 250: error is 50.295920 (50 iterations in 0.01 seconds)
## Iteration 300: error is 1.166688 (50 iterations in 0.00 seconds)
## Iteration 350: error is 0.595067 (50 iterations in 0.00 seconds)
## Iteration 400: error is 0.264928 (50 iterations in 0.00 seconds)
## Iteration 450: error is 0.131385 (50 iterations in 0.00 seconds)
## Iteration 500: error is 0.102971 (50 iterations in 0.00 seconds)
## Iteration 550: error is 0.098632 (50 iterations in 0.00 seconds)
## Iteration 600: error is 0.094107 (50 iterations in 0.00 seconds)
## Iteration 650: error is 0.098984 (50 iterations in 0.00 seconds)
## Iteration 700: error is 0.089372 (50 iterations in 0.00 seconds)
## Iteration 750: error is 0.089801 (50 iterations in 0.00 seconds)
## Iteration 800: error is 0.088156 (50 iterations in 0.00 seconds)
## Iteration 850: error is 0.087735 (50 iterations in 0.00 seconds)
## Iteration 900: error is 0.086534 (50 iterations in 0.00 seconds)
## Iteration 950: error is 0.085868 (50 iterations in 0.00 seconds)
## Iteration 1000: error is 0.089348 (50 iterations in 0.00 seconds)
## Fitting performed in 0.07 seconds.
# PCA 2-D scatter, colored by k-means cluster.
pca_2d_df <- as.data.frame(pca_2d)
pca_2d_df$cluster <- as.factor(clusters$cluster)
ggplot(pca_2d_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point() + theme_minimal() +
  ggtitle("PCA 2D Visualization with Cluster Coloring")

# t-SNE 2-D scatter.
data_2d <- as.data.frame(tsne_2d$Y)
data_2d$cluster <- as.factor(clusters$cluster)
ggplot(data_2d, aes(x = V1, y = V2, color = cluster)) +
  geom_point() + theme_minimal() +
  ggtitle("t-SNE 2D Visualization with Cluster Coloring")

# t-SNE 3-D coordinates with cluster labels (the original built this frame
# twice; once is enough).
data_3d <- as.data.frame(tsne_3d$Y)
data_3d$cluster <- as.factor(clusters$cluster)

# UMAP 2-D scatter.
data_2d <- data.frame(X = umap_result_2d$layout[, 1],
                      Y = umap_result_2d$layout[, 2])
data_2d$cluster <- as.factor(clusters$cluster)
ggplot(data_2d, aes(x = X, y = Y, color = cluster)) +
  geom_point() +
  theme_minimal() +
  ggtitle("UMAP 2D Visualization with Cluster Coloring")

# UMAP 3-D interactive scatter via plotly.
data_3d <- data.frame(X = umap_result_3d$layout[, 1],
                      Y = umap_result_3d$layout[, 2],
                      Z = umap_result_3d$layout[, 3])
data_3d$cluster <- as.factor(clusters$cluster)
fig <- plot_ly(data_3d, x = ~X, y = ~Y, z = ~Z, color = ~cluster,
               colors = RColorBrewer::brewer.pal(length(unique(data_3d$cluster)), "Dark2"),
               type = 'scatter3d', mode = 'markers')
fig
# This code performs feature selection by conducting pairwise t-tests between
# clusters for each feature in the data matrix, adjusting the p-values using
# the Benjamini-Hochberg method to identify statistically significant
# features. It creates a summary table to display significant features and
# their adjusted p-values.
# Pairwise t-tests between every pair of clusters for each feature; the
# per-feature minimum p-value is BH-adjusted, and features below 0.05 are
# reported as discriminating between at least one pair of clusters.
results_ttest <- list()
feature_names <- colnames(data_matrix)
unique_clusters <- unique(clusters$cluster)
for (i in seq_len(ncol(data_matrix))) {
  feature_results <- list()
  for (j in 1:(length(unique_clusters) - 1)) {
    for (k in (j + 1):length(unique_clusters)) {
      group1 <- data_matrix[clusters$cluster == unique_clusters[j], i]
      group2 <- data_matrix[clusters$cluster == unique_clusters[k], i]
      pair_key <- paste(unique_clusters[j], unique_clusters[k], sep = "_")
      # t.test() needs at least two observations per group in addition to
      # non-zero variance; the original's variance-only check still let
      # singleton clusters crash the test. Record NA for untestable pairs.
      if (length(group1) > 1 && length(group2) > 1 &&
          var(group1) != 0 && var(group2) != 0) {
        feature_results[[pair_key]] <- t.test(group1, group2)$p.value
      } else {
        feature_results[[pair_key]] <- NA
      }
    }
  }
  results_ttest[[feature_names[i]]] <- feature_results
}

# Per-feature minimum p-value. Features whose every pair was untestable
# give min() an empty set -> Inf (with a warning we suppress deliberately),
# which is mapped back to NA here.
all_p_values <- sapply(results_ttest, function(feature) {
  min_p_value <- suppressWarnings(min(unlist(feature), na.rm = TRUE))
  ifelse(is.infinite(min_p_value), NA, min_p_value)
})

p_adjusted <- p.adjust(all_p_values, method = "BH")
significant_features <- which(p_adjusted < 0.05)
summary_table <- data.frame(
  Feature = feature_names[significant_features],
  P_Value = p_adjusted[significant_features]
)
print(summary_table)
## Feature
## peak_expiratory_flow_pef_f3064_2_1 peak_expiratory_flow_pef_f3064_2_1
## triplet_played_left_f4229_0_9 triplet_played_left_f4229_0_9
## triplet_entered_left_f4236_0_8 triplet_entered_left_f4236_0_8
## ecg_stage_name_f5988_1_27 ecg_stage_name_f5988_1_27
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0 volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12 date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## typical_diet_yesterday_f100020_4_0 typical_diet_yesterday_f100020_4_0
## P_Value
## peak_expiratory_flow_pef_f3064_2_1 2.247864e-11
## triplet_played_left_f4229_0_9 1.601329e-03
## triplet_entered_left_f4236_0_8 4.292043e-08
## ecg_stage_name_f5988_1_27 7.797267e-12
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0 9.541974e-05
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0 2.161192e-08
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12 2.202589e-20
## typical_diet_yesterday_f100020_4_0 1.442177e-04
This script applies the Boruta algorithm to identify important features in the data matrix, visualizing their importance using box-and-whisker plots, and then confirms the selection of significant features using the TentativeRoughFix function.
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:mi':
##
## complete
## The following objects are masked from 'package:Matrix':
##
## expand, pack, unpack
library(dplyr)
# Boruta() is called below but the library call was missing from the
# visible source (the knit log shows it ran) -- attach it explicitly.
library(Boruta)

# Boruta feature selection of data-matrix columns against cluster labels.
cluster_numbers <- clusters$cluster
cluster_factors <- as.factor(cluster_numbers)
set.seed(123)  # reproducible shadow-feature permutations
data_matrix_df <- as.data.frame(data_matrix)
boruta_output <- Boruta(cluster_factors ~ ., data = data_matrix_df, doTrace = 0)

library(plotly)
# Interactive box-plot of the Boruta importance history, one box per
# feature (replaces the classic base-graphics importance plot).
df_long <- tidyr::pivot_longer(as.data.frame(boruta_output$ImpHistory),
                               cols = everything(),
                               names_to = "feature",
                               values_to = "measurement")
plot_ly(df_long, x = ~feature, y = ~measurement, color = ~feature, type = "box") %>%
  layout(title = "Box-and-whisker Plots across all features (UKBB Data)",
         xaxis = list(title = "Features", categoryorder = "total descending"),
         yaxis = list(title = "Importance"), showlegend = FALSE)
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Boruta performed 99 iterations in 0.6807549 secs.
## Tentatives roughfixed over the last 99 iterations.
## 8 attributes confirmed important:
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12,
## ecg_stage_name_f5988_1_27,
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0,
## peak_expiratory_flow_pef_f3064_2_1, triplet_entered_left_f4236_0_8 and
## 3 more;
## 4 attributes confirmed unimportant:
## number_of_operations_selfreported_f136_0_0,
## total_peripheral_resistance_during_pwa_f12685_2_0,
## urea_reportability_f30676_1_0, values_wanted_f23321_3_16;
## number_of_operations_selfreported_f136_0_0
## Rejected
## peak_expiratory_flow_pef_f3064_2_1
## Confirmed
## triplet_played_left_f4229_0_9
## Confirmed
## triplet_entered_left_f4236_0_8
## Confirmed
## ecg_stage_name_f5988_1_27
## Confirmed
## total_peripheral_resistance_during_pwa_f12685_2_0
## Rejected
## values_wanted_f23321_3_16
## Rejected
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## Confirmed
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## Confirmed
## urea_reportability_f30676_1_0
## Rejected
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## Confirmed
## typical_diet_yesterday_f100020_4_0
## Confirmed
## Levels: Tentative Confirmed Rejected
## Loaded glmnet 4.1-8
## Call:
## knockoff.filter(X = data, y = as.numeric(cluster_factors), fdr = 0.8)
##
## Selected variables:
## number_of_operations_selfreported_f136_0_0
## 1
## peak_expiratory_flow_pef_f3064_2_1
## 2
## triplet_played_left_f4229_0_9
## 3
## triplet_entered_left_f4236_0_8
## 4
## ecg_stage_name_f5988_1_27
## 5
## total_peripheral_resistance_during_pwa_f12685_2_0
## 6
## mean_fa_in_anterior_corona_radiata_on_fa_skeleton_right_f25078_2_0
## 7
## volume_of_caudalmiddlefrontal_right_hemisphere_f27299_2_0
## 8
## date_of_first_inpatient_diagnosis_icd10_f41280_0_12
## 9
## typical_diet_yesterday_f100020_4_0
## 10